sample_raw <- read_csv("data/us1910m_usa_sample100k.csv")
edict <- read_csv("data/full_mn_dict.csv")
sample_raw <- street_match(sample_raw, edict)
sample <- fillDownStreet(sample_raw)
Note: code is the same as Bo’s in_progress_HN_html script, look at that script for more documentation
Preprocessing to filter records with address numbers - note, not filtered by “H” records as this presents difficults downstream with look around (too many NAs).
HN <- sample %>%
fill(microfilm, .direction = "up") %>%
filter(!is.na(house_num)) %>%
select(microfilm, ED,
dwelling_ser, house_num, street_add, best_match) %>%
mutate(originalHN = house_num,
house_num = ifelse(house_num=="", NA, as.character(house_num)),
street_add = ifelse(is.na(street_add), " ", street_add))
Extract house number
HN <- HN %>% rowwise() %>%
mutate(flag1=ifelse(is.na(house_num)&
!is.na(str_extract(street_add,"[0-9]+\\s*(-|TO)+\\s*[0-9]+")),
str_extract(street_add,"[0-9]+\\s*(-|TO)+\\s*[0-9]+"),
NA),
house_num=ifelse(is.na(house_num), flag1, house_num),
flg_hn_from_strt2=ifelse((!is.na(flag1))&(house_num != flag1) , 1, 0),
flg_hn_from_strt=ifelse(is.na(flag1), 0, 1),
house_num = gsub("\\s*(TO)\\s*", "-", house_num, ignore.case = TRUE)) %>%
select(-flag1)
Cleaning The chunk below does:
1/2 or 1-2 from house_num to modifier.number column1/2 or 1-2 from house_numTO and / into -house_num to modifier.number columnhouse_num** I include 1-2 just in case this exists. The regex for this case excludes 11-20 for example.
HN <- HN %>% rowwise() %>%
mutate(modifier.number = str_extract(house_num, "\\s1[/-]2\\b"),
house_num = gsub("\\s1[/-]2\\b", " ", house_num),
house_num = gsub("\\s*(TO|/|&)\\s*", "-", house_num, ignore.case = TRUE),
house_num = gsub("(?<=\\d)\\s+(?=\\d)", "-", house_num, ignore.case = TRUE, perl = TRUE),
modifier.word = trimws(str_extract(house_num, "[A-Za-z\\s]+")),
house_num = gsub("\\s+", " ", house_num),
house_num = trimws(gsub("[A-Za-z\\s]+", "", house_num)),
house_num = gsub("^\\D+", "", house_num, ignore.case = TRUE),
house_num = gsub("\\D+$", "", house_num, ignore.case = TRUE),
flg_cleaned = ifelse(originalHN=="", 0, ifelse(house_num==originalHN, 0, 1)))
Splitting house ranges with “-” First, a look at such house numbers:
HN %>%
filter(str_detect(house_num, "-")) %>%
select(ED, best_match, house_num) %>%
distinct() %>%
kable() %>% kable_styling() %>% scroll_box(height = "300px")
| ED | best_match | house_num |
|---|---|---|
| 0020 | PARK ROW | 177-72 |
| 0021 | ROOSEVELT | 14-16 |
| 0021 | ROOSEVELT | 18-20 |
| 0022 | OLIVER | 31-33 |
| 0023 | OLIVER | 40-38 |
| 0023 | CATHERINE | 40-42 |
| 0023 | MADISON | 67-65 |
| 0023 | OAK | 48-46 |
| 0024 | JAMES | 48-50 |
| 0024 | MADISON | 44-46 |
| 0024 | MADISON | 52-50 |
| 0024 | MADISON | 50-52 |
| 0025 | OAK | 30-32 |
| 0025 | OAK | 30-52 |
| 0025 | OAK | 21-28 |
| 0026 | CHERRY | 108-110 |
| 0026 | OAK | 45-47 |
| 0026 | CATHERINE | 88-90 |
| 0026 | CHERRY | 108-10 |
| 0027 | OLIVER | 81-83 |
| 0029 | OLIVER | 100-102 |
| 0029 | CHERRY | 03-103 |
| 0045 | MULBERRY | 22-24 |
| 0046 | MOTT | 34-36-38 |
| 0046 | MOTT | 54-36-38 |
| 0046 | CHATHAM SQUARE | 7-8 |
| 0048 | MULBERRY | 48-50 |
| 0048 | MULBERRY | 54-56 |
| 0048 | MULBERRY | 50-56 |
| 0048 | MULBERRY | 36-38 |
| 0051 | MOTT | 52-56 |
| 0127 | JONES | 32-34 |
| 0127 | JONES | 32-32 |
| 0127 | 4 W | 170-168 |
| 0127 | CORNELIA | 25-27 |
| 0127 | CORNELIA | 29-29 |
| 0127 | BLEECKER | 168-170 |
| 0127 | 4 W | 169-70 |
| 0127 | W 4 | 168-170 |
| 0128 | BEDFORD | 72-74 |
| 0128 | BLEECKER | 272-78 |
| 0129 | MORTON | 10-12 |
| 0129 | MORTON | 10-32 |
| 0130 | BEDFORD | 47-9 |
| 1654 | 99 E | 311-313 |
| 1654 | E 99 | 311-313 |
| 1654 | FIR AVE | 37-1939 |
| 1654 | FIR AVE | 37-1339 |
| 1654 | FIR AVE | 41-1943 |
| 1654 | FIR AVE | 45-1947 |
| 1654 | 2 AVE | 24-1926 |
| 1654 | 2 AVE | 28-1930 |
| 1654 | 2 AVE | 32-1934 |
| 1654 | 100 E | 302-304 |
| 1654 | 100 E | 306-308 |
| 1654 | 100 E | 318-320 |
| 1654 | 100 E | 322-324 |
| 1654 | 100 E | 326-328 |
| 1654 | 100 E | 330-332 |
| 1654 | 100 E | 336-338 |
| 1654 | 100 E | 342-344 |
| 1654 | E 100 | 318-320 |
| 0354 | 3 AVE | 67-1769 |
| 0354 | E 99 | 204-206 |
| 0354 | E 99 | 24-2269 |
| 0354 | 2 AVE | 230-2 |
| 0354 | 98 E | 289-291 |
| 0354 | 98 E | 227-229 |
| 0354 | 98 E | 217-219 |
| 0356 | 98 E | 232-234 |
| 0357 | E 96 | 199-201 |
| 0357 | E 96 | 203-205 |
| 0357 | E 96 | 207-209 |
| 0357 | 3 AVE | 1713-1715 |
| 0357 | 3 AVE | 713-1715 |
| 0358 | E 95 | 332-334 |
| 0358 | FIR AVE | 23-1825 |
| 0358 | E 94 | 345-347 |
| 0358 | E 94 | 341-343 |
| 0358 | E 94 | 337-339 |
| 0358 | E 94 | 333-335 |
| 0358 | E 94 | 329-331 |
| 0358 | E 95 | 316-317 |
| 0358 | E 95 | 304-311 |
| 0358 | E 95 | 328-330 |
| 0358 | E 94 | 335-333 |
| 0358 | E 95 | 309-311 |
| 0359 | E 96 | 210-212 |
| 0363 | 93 E | 314-312 |
| 0364 | FIR AVE | 1775-3-10 |
| 0364 | 1 AVE | 1775-1 |
One notable problem is that some number ranges are incomplete, e.g. 177-72. To fix house number ranges, we will use a code that splits house numbers by “-”, and appends the leading n numbers to the front for house numbers that are of different length post-split. This is subject to the constraint that the resulting number will have the same length.
An example of how this works on actual numbers observed above:
14-16 > split > 14-16 > detect length > 2, 2 > No further action! 177-72 > split > 177, 72 > detect length > 3, 2 > append front 1 (3 - 2) chr > 177, 172 1775-3-10 > split > 1775, 3, 10 > detect length > 4, 1, 12 > for 3: append front 3 (4 - 1) chr > 1773 for 10: no appending because if append resulting string = 17710 > 10
hn_range_clean <- function(hn_range) {
hn <- hn_range %>%
str_split("-") %>% unlist()
same_length <- str_length(hn) %>%
unique() %>%
length() == 1
if (!same_length) {
hn <- as.integer(hn)
max_hn <- as.character(max(hn))
min_hn <- as.character(min(hn))
len_diff <- str_length(max_hn) - str_length(min_hn)
clean <- map_chr(hn, function(x) ifelse(str_length(x) < str_length(max_hn),
ifelse(str_length(paste0(str_sub(max_hn, 1, len_diff), x)) == str_length(max_hn),
paste0(str_sub(max_hn, 1, len_diff), x),
x),
x)
)
hn_range <- str_c(clean, collapse = "-")
}
return(hn_range)
}
Before and after cleaning for cleaned house num ranges:
HN %>%
filter(str_detect(house_num, "-")) %>%
mutate(hn_cleaned = hn_range_clean(house_num)) %>%
filter(house_num != hn_cleaned) %>%
select(ED, best_match, house_num, hn_cleaned) %>%
distinct() %>%
kable() %>% kable_styling() %>%
row_spec(19, background = "#ff8080") %>% scroll_box(height = "300px")
| ED | best_match | house_num | hn_cleaned |
|---|---|---|---|
| 0020 | PARK ROW | 177-72 | 177-172 |
| 0026 | CHERRY | 108-10 | 108-110 |
| 0029 | CHERRY | 03-103 | 103-103 |
| 0127 | 4 W | 169-70 | 169-170 |
| 0128 | BLEECKER | 272-78 | 272-278 |
| 0130 | BEDFORD | 47-9 | 47-49 |
| 1654 | FIR AVE | 37-1939 | 1937-1939 |
| 1654 | FIR AVE | 37-1339 | 1337-1339 |
| 1654 | FIR AVE | 41-1943 | 1941-1943 |
| 1654 | FIR AVE | 45-1947 | 1945-1947 |
| 1654 | 2 AVE | 24-1926 | 1924-1926 |
| 1654 | 2 AVE | 28-1930 | 1928-1930 |
| 1654 | 2 AVE | 32-1934 | 1932-1934 |
| 0354 | 3 AVE | 67-1769 | 1767-1769 |
| 0354 | E 99 | 24-2269 | 2224-2269 |
| 0354 | 2 AVE | 230-2 | 230-232 |
| 0357 | 3 AVE | 713-1715 | 1713-1715 |
| 0358 | FIR AVE | 23-1825 | 1823-1825 |
| 0364 | FIR AVE | 1775-3-10 | 1775-1773-10 |
| 0364 | 1 AVE | 1775-1 | 1775-1771 |
The 19th row is highlighted as the output of clean is still wrong. It is currently not addressed as the I am not sure what the house number should actually be.
Code to actually create the separated house numbers:
HN <- HN %>%
mutate(hn_range = hn_range_clean(house_num)) %>%
mutate(flg_cleaned = ifelse(house_num == hn_range, 0, 1))
splt_df <- str_split_fixed(HN$hn_range, pattern = "-", n = 3) %>% data.frame() %>%
rename(hn_1 = X1, hn_2 = X2, hn_3 = X3) %>%
mutate_all(as.character) %>% mutate_all(as.numeric)
HN <- HN %>% cbind(splt_df)
Basic Principle: if the first non-NA above and first non-NA below has a difference of 0 or 2, AND the ED of those are the same, AND the street address are the same, fill down.
approach_1 <- HN %>%
# fix street_add
mutate(street_add = as.character(street_add)) %>%
# create copy of the columns to filled up/down - these should only be filled for records with house no.s
mutate(above_hn = hn_1, below_hn = hn_1) %>%
mutate(above_ED = ifelse(!is.na(hn_1), ED, NA), below_ED = ifelse(!is.na(hn_1), ED, NA)) %>%
mutate(above_add = ifelse(!is.na(hn_1), street_add, NA),
below_add = ifelse(!is.na(hn_1), street_add, NA)) %>%
# fill
fill(above_hn, .direction = "down") %>%
fill(below_hn, .direction = "up") %>%
fill(above_ED, .direction = "down") %>%
fill(below_ED, .direction = "up") %>%
fill(above_add, .direction = "down") %>%
fill(below_add, .direction = "up") %>%
# can we confidently fill up/down?
mutate(yes_fill = ifelse(abs(above_hn - below_hn) %in% c(0, 2)
& above_ED == below_ED & above_ED == ED
& above_add == below_add & above_add == street_add
& is.na(hn_2),
1, 0))
# how many confident fill downs?
sum(approach_1$yes_fill)
sum(approach_1$yes_fill) - sum(!is.na(approach_1$hn_1))
Using this method, 5806 additional records have been recovered (with confidence), i.e. we have 10660 confident house numbers. This number is likely to increase with cleaned street addresses.
Code to actually do the fill down (above is just EDA). Chunk set to eval = FALSE for now. Run this code instead of chunk above.
HN2 <- HN %>%
# create copy of the columns to filled up/down - these should only be filled for records with house no.s
mutate(above_hn = hn_1, below_hn = hn_1) %>%
mutate(above_ED = ifelse(!is.na(hn_1), ED, NA), below_ED = ifelse(!is.na(hn_1), ED, NA)) %>%
mutate(above_add = ifelse(!is.na(hn_1), best_match, NA),
below_add = ifelse(!is.na(hn_1), best_match, NA)) %>%
# fill
fill(above_hn, .direction = "down") %>%
fill(below_hn, .direction = "up") %>%
fill(above_ED, .direction = "down") %>%
fill(below_ED, .direction = "up") %>%
fill(above_add, .direction = "down") %>%
fill(below_add, .direction = "up") %>%
# replace hn_1 column
mutate(hn_1 = ifelse(abs(above_hn - below_hn) %in% c(0, 2)
& above_ED == below_ED & above_ED == ED
& above_add == below_add & above_add == best_match
& is.na(hn_2),
above_hn, hn_1)) %>%
# clean
select(- c(above_hn, below_hn, above_ED, below_ED, above_add, below_add))
We use the street dictionary with house numbers (hn_dict) to verify which house numbers are completely not within the range indicated by the street dictionary.
Import and wrangle hn_dict to be appropriate for our use.
hn_dict <- read_csv("data/combine_mn.csv")
# some of the highs are lower than low; define function below to fix this
fix_order <- function(str) {
if (str == "NA?NA") {return (NA_character_)}
str_split(str, "\\?") %>%
unlist() %>%
as.integer() %>%
sort() %>%
str_c(collapse = "?")
}
hn_dict <- hn_dict %>%
mutate(ED = str_pad(ED, 4, "left", pad = "0")) %>%
select(ED, Name, Left_Low, Left_High, Right_Low, Right_High) %>%
unite(col = "Left", Left_Low, Left_High, sep = "?") %>%
unite(col = "Right", Right_Low, Right_High, sep = "?") %>%
rowwise() %>%
mutate(Left = fix_order(Left), Right = fix_order(Right)) %>%
ungroup() %>%
gather(key = "column", value = "value", -c(ED, Name)) %>%
separate(col = value, into = c("Low", "High"), sep = "\\?") %>%
mutate(Low = as.integer(Low), High = as.integer(High))
EDA of house numbers beyond range dictated by hn_dict
large_HN <- HN %>%
left_join(hn_dict, by = c("ED" = "ED", "best_match" = "Name"))
large_HN %>%
ungroup() %>%
mutate(within = ifelse(hn_1 < Low | hn_1 > High, FALSE, TRUE)) %>%
mutate(range = paste0(Low, "-", High)) %>%
group_by(ED, best_match, hn_1) %>%
summarize(n_out = sum(within), indicative_range = first(range)) %>%
filter(n_out == 0) %>%
select(- n_out) %>%
kable() %>% kable_styling() %>% scroll_box(height = "300px")
| ED | best_match | hn_1 | indicative_range |
|---|---|---|---|
| 0020 | PEARL | 3 | 429-447 |
| 0021 | PARK ROW | 4 | 186-192 |
| 0021 | ROOSEVELT | 73 | 2-36 |
| 0022 | JAMES | 36 | 22-30 |
| 0022 | JAMES | 37 | 22-30 |
| 0022 | OLIVER | 9 | 15-33 |
| 0022 | OLIVER | 11 | 15-33 |
| 0022 | OLIVER | 13 | 15-33 |
| 0023 | MADISON | 52 | 57-67 |
| 0023 | MADISON | 6765 | 57-67 |
| 0023 | OAK | 18 | 48-58 |
| 0024 | JAMES | 41 | 46-72 |
| 0024 | JAMES | 554 | 46-72 |
| 0024 | OAK | 36 | 38-46 |
| 0024 | OLIVER | 68 | 35-55 |
| 0024 | OLIVER | 70 | 35-55 |
| 0025 | JAMES | 26 | 46-72 |
| 0025 | JAMES | 28 | 46-72 |
| 0026 | CATHERINE | 96 | 80-94 |
| 0026 | CHERRY | 1096 | 100-114 |
| 0026 | OAK | 45 | 48-58 |
| 0027 | OLIVER | 97 | 59-81 |
| 0027 | OLIVER | 818 | 59-81 |
| 0029 | CATHERINE SLIP | 369 | 12-26 |
| 0037 | CITY HALL PL | 38040 | 18-44 |
| 0038 | FRANKLIN | 89 | 65-67 |
| 0038 | FRANKLIN | 518 | 65-67 |
| 0038 | PEARL | 22 | 535-557 |
| 0038 | WALKER | 16 | 68-76 |
| 0038 | WALKER | 37 | 68-76 |
| 0038 | WALKER | 995 | 68-76 |
| 0038 | WALKER | 44101 | 68-76 |
| 0040 | CANAL | 116 | 222-238 |
| 0040 | FRANKLIN | 22 | 1-17 |
| 0040 | LEONARD | 115 | 141-169 |
| 0040 | WHITE | 0 | 115-116 |
| 0040 | WHITE | 5 | 115-116 |
| 0040 | WHITE | 8 | 115-116 |
| 0040 | WHITE | 12 | 115-116 |
| 0040 | WHITE | 16 | 115-116 |
| 0040 | WHITE | 18 | 115-116 |
| 0040 | WHITE | 94 | 115-116 |
| 0040 | WHITE | 118 | 115-116 |
| 0041 | LEONARD | 170 | 141-169 |
| 0044 | BAXTER | 0 | 2-24 |
| 0044 | BAXTER | 28 | 2-24 |
| 0044 | WORTH | 9 | 171-181 |
| 0044 | WORTH | 25 | 171-181 |
| 0044 | WORTH | 196 | 171-181 |
| 0046 | BOWERY | 28 | 2-18 |
| 0046 | DOYERS | 18 | 1-17 |
| 0046 | DOYERS | 19 | 1-17 |
| 0046 | DOYERS | 507 | 1-17 |
| 0046 | DOYERS | 517 | 1-17 |
| 0046 | MOTT | 54 | 1-21 |
| 0049 | CANAL | 278 | 206-220 |
| 0050 | MOTT | 11 | 55-85 |
| 0050 | MOTT | 13 | 55-85 |
| 0050 | MOTT | 7375 | 55-85 |
| 0050 | MULBERRY | 32 | 71-92 |
| 0050 | MULBERRY | 1850 | 71-92 |
| 0051 | BAYARD | 10 | 57-73 |
| 0051 | CANAL | 7 | 150-162 |
| 0051 | MOTT | 7 | 55-85 |
| 0051 | MOTT | 8 | 55-85 |
| 0051 | MOTT | 28 | 55-85 |
| 0051 | MOTT | 174 | 55-85 |
| 0126 | BARROW | 65 | 37-63 |
| 0126 | BLEECKER | 587 | 280-294 |
| 0126 | COMMERCE | 0 | 2-30 |
| 0127 | BLEECKER | 168 | 272-278 |
| 0127 | CORNELIA | 37 | 2-36 |
| 0127 | CORNELIA | 7527 | 2-36 |
| 0128 | BLEECKER | 280 | 272-278 |
| 0128 | BLEECKER | 282 | 272-278 |
| 0128 | BLEECKER | 284 | 272-278 |
| 0130 | BEDFORD | 47 | 79-83 |
| 0130 | BEDFORD | 61 | 79-83 |
| 0130 | BEDFORD | 63 | 79-83 |
| 0130 | BEDFORD | 65 | 79-83 |
| 0130 | BEDFORD | 67 | 79-83 |
| 0130 | BEDFORD | 69 | 79-83 |
| 0130 | BEDFORD | 71 | 79-83 |
| 0130 | BEDFORD | 75 | 79-83 |
| 0130 | HUDSON | 71 | 447-463 |
| 0130 | HUDSON | 77 | 447-463 |
| 0130 | HUDSON | 81 | 447-463 |
| 0130 | HUDSON | 150 | 447-463 |
| 0130 | MORTON | 23 | 29-67 |
| 0132 | LEROY | 66 | 36-64 |
| 0133 | DOWNING | 246 | 34-68 |
| 0352 | 2 AVE | 225 | 1939-1961 |
| 0352 | 2 AVE | 1851 | 1939-1961 |
| 0352 | 2 AVE | 1905 | 1939-1961 |
| 0352 | 3 AVE | 17 | 1802-1822 |
| 0352 | 3 AVE | 811 | 1802-1822 |
| 0353 | 2 AVE | 199 | 1915-1937 |
| 0354 | E 99 | 200 | 201-241 |
| 0354 | E 99 | 2224 | 201-241 |
| 0356 | 2 AVE | 206 | 1881-1895 |
| 0356 | 2 AVE | 208 | 1881-1895 |
| 0356 | 2 AVE | 210 | 1881-1895 |
| 0356 | 2 AVE | 212 | 1881-1895 |
| 0356 | 2 AVE | 214 | 1881-1895 |
| 0356 | 2 AVE | 220 | 1881-1895 |
| 0356 | 2 AVE | 222 | 1881-1895 |
| 0356 | 2 AVE | 225 | 1881-1895 |
| 0356 | 2 AVE | 227 | 1881-1895 |
| 0356 | 2 AVE | 233 | 1881-1895 |
| 0356 | 2 AVE | 235 | 1881-1895 |
| 0356 | 2 AVE | 250 | 1881-1895 |
| 0356 | 2 AVE | 1833 | 1881-1895 |
| 0356 | E 97 | 200 | 201-237 |
| 0357 | 3 AVE | 1892 | 1710-1724 |
| 0357 | E 97 | 200 | 201-237 |
| 0358 | 2 AVE | 1812 | 1841-1863 |
| 0359 | 2 AVE | 1888 | 1841-1863 |
| 0359 | 3 AVE | 677 | 1694-1708 |
| 0360 | 3 AVE | 1889 | 1678-1692 |
| 0360 | E 94 | 568 | 201-249 |
| 0360 | E 95 | 328 | 201-239 |
| 0361 | 2 AVE | 125 | 1801-1815 |
| 0361 | 2 AVE | 239 | 1801-1815 |
| 0361 | 3 AVE | 200 | 1662-1676 |
| 0361 | 3 AVE | 236 | 1662-1676 |
| 0361 | 3 AVE | 248 | 1662-1676 |
| 0361 | 3 AVE | 1009 | 1662-1676 |
| 0361 | E 93 | 2443 | 201-247 |
| 0362 | 2 AVE | 180 | 1801-1815 |
| 0362 | 2 AVE | 308 | 1801-1815 |
| 0364 | 3 AVE | 164 | 1644-1660 |
| 0364 | 3 AVE | 1756 | 1644-1660 |
| 0365 | FIR AVE | 341 | 1739-1753 |
| 0366 | E 89 | 211 | 301-355 |
| 0366 | E 89 | 229 | 301-355 |
| 0366 | FIR AVE | 1753 | 1721-1737 |
| 0682 | CHERRY | 9092 | 80-98 |
| 0682 | JAMES | 102 | 87-99 |
| 1654 | FIR AVE | 1337 | 1927-1953 |
There are 139 records out of the bounds of the house number range.
Tailor the code to specifically look for large house numbers. The conditions for this:
- Length of house number is larger than 4
- Length of house number is not the same as low and high house number range from hn_dict (as large numbers may exist for certain streets)
- However, the separated house number would be within the house number range from hn_dict
large_HN %>%
filter(str_length(hn_1) >= 4) %>%
filter(str_length(hn_1) != str_length(Low) & str_length(hn_1) != str_length(High)) %>%
mutate(hn1_first_2 = as.integer(str_sub(hn_1, 1, 2))) %>%
mutate(hn1_last_2 = as.integer(str_sub(hn_1, str_length(hn_1) - 1, str_length(hn_1)))) %>%
filter(hn1_first_2 >= Low & hn1_first_2 <= High & hn1_last_2 >= Low & hn1_last_2 <= High) %>%
select(house_num, hn_1, hn_2, hn_3, Low, High) %>%
kable() %>% kable_styling()
| house_num | hn_1 | hn_2 | hn_3 | Low | High |
|---|---|---|---|---|---|
| 6765 | 6765 | NA | NA | 57 | 67 |
| 6765 | 6765 | NA | NA | 58 | 72 |
| 9092 | 9092 | NA | NA | 80 | 98 |
| 9092 | 9092 | NA | NA | 81 | 99 |
| 38040 | 38040 | NA | NA | 18 | 44 |
| 7375 | 7375 | NA | NA | 55 | 85 |
| 7375 | 7375 | NA | NA | 52 | 80 |
Code to actually split the house num:
large_HN <- large_HN %>%
mutate(hn1_first_2 = as.integer(str_sub(house_num, 1, 2)),
hn1_last_2 = as.integer(str_sub(house_num, str_length(house_num) - 1, str_length(house_num)))) %>%
mutate(hn_1 = ifelse(str_length(house_num) != str_length(Low)
& str_length(house_num) != str_length(High)
& hn1_first_2 >= Low & hn1_first_2 <= High
& hn1_last_2 >= Low & hn1_last_2 <= High,
hn1_first_2, hn_1),
hn_2 = ifelse(str_length(house_num) != str_length(Low)
& str_length(house_num) != str_length(High)
& hn1_first_2 >= Low & hn1_first_2 <= High
& hn1_last_2 >= Low & hn1_last_2 <= High,
hn1_last_2, hn_2)) %>%
mutate(hn_1 = ifelse(is.na(hn_1), house_num, hn_1))
## Warning: NAs introduced by coercion
The original census manuscripts were then referred to in order to understand what the mistakes are.
Here, 54 was clearly written in the manuscript but somehow mistranscribed into 554.
9092 was clearly a house number range, especially since it’s in between 86 and 94.
210
Here, the odd house numbers was due to the census taker going back to houses that he failed to visit (e.g. if no one was home), hence the sequence of numbers is very random.
0
Here, 17 was written but mistranscribed into 0.
Some ‘large’ house numbers are improperly cleaned house ranges. For example, 177-72 probably means 177-172, but the code would instead separate the range into 177-72.
Further EDA on errors detected by house number ranges, nothing that these may instead reflect bad matches derived from the address cleaning process.